Heatmap function

create_gene_heatmap <- function(de_results_df, dge_object, font_size = 12, num_genes = 20) {
  # Subset the genes
  top_genes <- de_results_df$Symbol[1:num_genes]
  subset_matrix <- dge_object$logCPM[top_genes, colnames(dge_object$counts)]
  
  # Create annotation dataframe
  sample_info <- dge_object$samples
  annotation_df <- data.frame(
    Dose = sample_info$Dose
  )
  rownames(annotation_df) <- colnames(subset_matrix)
  
  # Reorder gene expression matrix
  ordered_samples <- order(sample_info$Dose)
  subset_matrix <- subset_matrix[, ordered_samples]
  sample_info <- sample_info[ordered_samples, ]
  
  # Create the heatmap
  heatmap <- pheatmap(subset_matrix,
           scale = "row",
           cluster_rows = FALSE,
           cluster_cols = FALSE,
           color = viridis(50),
           show_colnames = FALSE,
           annotation_col = annotation_df,
           fontsize = font_size,
           silent = FALSE)
}

Recap

These are the first experiments where I use version 2 chemistry. Read 2 is now the barcode and UMI. Read 1 is the cDNA. This was done to avoid reading through the TSO on Ilumina read 1

This sequencing run consists of 3 experiments:

Test version 2 protocol with a biological application from Zac Moore of Brain Cancer lab https://au-mynotebook.labarchives.com/share/Piper_Research_Project/MzEuMjAwMDAwMDAwMDAwMDAzfDE3MTc2NS8yNC9UcmVlTm9kZS8zMzczNDM0NjE1fDc5LjE5OTk5OTk5OTk5OTk5

Notebook aim

Differential expression testing. Focus on the genes that vary between doses.

Read SCE and extract col data

This object was generated in 2A_Clustering.
Subset only day 3 timepoint for simplicity

sce <- readRDS(here::here(
  "data/TIRE_brain_human/SCEs/", "brainCancer_subset.sce.rds"
))

dge <- scran::convertTo(sce, type="edgeR")

day <- 3
dge <- dge[,dge$samples$Day_Exposure %in% day]

tb <- as_tibble(dge$samples)

Convert the numerical dose column to a factor. For simplicity sake keep the doses with rounded concentrations.

tb$Dose <- as.factor(tb$Dose_M)
tb$Dose <- recode(tb$Dose,
                  "0" = "DMSO",
                  "1e-04" = "100uM",
                  "1e-07" = "100nM",
                  "1e-06" = "1uM",
                  "1e-05" = "10uM")
dge$samples$Dose <- tb$Dose

dge <- dge[,dge$samples$Dose %in% c("DMSO", "10uM", "100uM", "1uM", "100nM")]
dge$samples$Dose <- droplevels(dge$samples$Dose)

Differential expression testing

Have a look at the important metadata.

tb %>% 
  dplyr::count(Drug, Day_Exposure, Dose) %>% 
  arrange(Day_Exposure, Dose)

Remove genes that are lowly expressed. 12,000 genes are kept

keep <- filterByExpr(dge, group=dge$samples$Drug, min.count=1)
dge <- dge[keep,]
summary(keep)
##    Mode   FALSE    TRUE 
## logical   12599   11653

Correct for composition biases by computing normalization factors with the trimmed mean of M-values method.

dge <- calcNormFactors(dge)

Perform differential expression testing

Model matrix generation.

design <- model.matrix(~0 + Dose, dge$samples)
head(design)
##            DoseDMSO Dose100nM Dose1uM Dose10uM Dose100uM
## AACGTCAGCC        0         1       0        0         0
## ACAGTAGCTC        0         0       0        0         1
## ATCAAGAACG        0         0       0        1         0
## CAATTGTTCG        0         1       0        0         0
## CATGTACTTG        0         1       0        0         0
## CCAAGACGTG        0         0       0        1         0
dge <- estimateDisp(dge, design)
summary(dge$trended.dispersion)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.01203 0.01421 0.02015 0.02395 0.03243 0.04630

Create the contrast matrix. Foucs on the largest changes.

contr.matrix <- makeContrasts(
   Dose100uM = Dose100uM - DoseDMSO,
   Dose10uM = Dose10uM - DoseDMSO, 
   Dose1uM = Dose1uM - DoseDMSO, 
   Dose100nM = Dose100nM - DoseDMSO, 
   Dose100uM_vs_10uM = Dose100uM - Dose10uM,
   levels = colnames(design))

In each contrast, the format is A - B where:

  • A represents the condition considered as the “treatment” or point of interest
  • B represents the condition considered as the “control” or baseline

Fit BCV

par(mfrow=c(1,2))
v <- voom(dge, design, plot=TRUE)

vfit <- lmFit(v, design)
vfit <- contrasts.fit(vfit, contrasts=contr.matrix)
efit <- eBayes(vfit)
plotSA(efit, main="Final model: Mean-variance trend")

Differential expression analysis

Check how many genes are differentially expressed. Fewer genes at lower doses which makes sense.

summary(decideTests(efit))
##        Dose100uM Dose10uM Dose1uM Dose100nM Dose100uM_vs_10uM
## Down          26       49       0         0                 4
## NotSig     11606    11520   11651     11653             11649
## Up            21       84       2         0                 0

Visualise differentially expressed genes

100uM Dose

hundred_uM <- topTable(efit, coef=1, n=length(efit$genes$ID), sort.by = "logFC")
results <- as_tibble(hundred_uM)
results$ID <- rownames(hundred_uM)

# add a column of NAs
results$DElabel <- "n/s"
# if log2Foldchange > 1 or < -1 and pvalue < 0.05, set as "UP"
results$DElabel[results$logFC > 1 & results$adj.P.Val < 0.1] <- "TMZ"
results$DElabel[results$logFC < -1 & results$adj.P.Val < 0.1] <- "DMSO"
results_hundred <- results

Add gene labels

results$genelabels <- ""
results$genelabels[results$Symbol == "GDF15"] <- "GDF15"
results$genelabels[results$Symbol == "FAS"] <- "FAS"
results$genelabels[results$Symbol == "IGFBP5"] <- "IGFBP5"
results$genelabels[results$Symbol == "NEAT1"] <- "NEAT1"
results$genelabels[results$Symbol == "CDKN1A"] <- "CDKN1A"
results$genelabels[results$Symbol == "PDGFRA"] <- "PDGFRA"
results$genelabels[results$Symbol == "HMGCS1"] <- "HMGCS1"
results$genelabels[results$Symbol == "SCN1A"] <- "SCN1A"

Volcano

results$DElabel <- factor(results$DElabel, levels = c("TMZ", "n/s", "DMSO"))  # Adjust as per your actual labels

plt1 <- ggplot(data = results, aes(x = logFC, y = -log10(adj.P.Val), colour = DElabel, label = genelabels)) + 
  geom_point(alpha = 0.33, size = 1.5) +
  geom_text_repel(size = 4, colour = "black") +
  guides(colour = guide_legend(override.aes = list(size = 3, alpha = 1))) +
  scale_color_manual(
    values = c("darkorange", "grey", "darkblue"),  # Colors in the desired order
    name = "Upregulated",
    labels = c("TMZ", "n/s", "DMSO")  # Optional: Add custom labels
  ) +
  geom_vline(xintercept = 1, linetype = "dotted") + 
  geom_vline(xintercept = -1, linetype = "dotted") +
  theme_Publication()

plt1

Generate heatmap

dge$logCPM <- edgeR::cpm(dge, log=TRUE, prior.count=1)
dge_subset <- dge[,dge$samples$Dose == "DMSO" | dge$samples$Dose == "100uM"]
create_gene_heatmap(de_results_df = results, dge_object = dge_subset, font_size = 12, num_genes=10)

Gene set testing with camera

Need to convert geneIDs from ensembl to enterez

geneids <- as.data.frame(v$genes$ID)
colnames(geneids) <- "ENSEMBL"

geneids$entrez <- mapIds(org.Hs.eg.db, keys = geneids$ENSEMBL, keytype = "ENSEMBL", column = "ENTREZID")

Nothing surprising here, cell ycle goes down p53 goes up though very significant.

load("data/MSigDB/human_H_v5p2.rdata")
idx <- ids2indices(Hs.H,identifiers = geneids$entrez)
cam.100uM <- camera(v,idx,design,contrast=contr.matrix[,1])
head(cam.100uM,10)

Visualize the gene set testing.

par(mfrow=c(1,1))

barcodeplot(efit$t[,1], index=idx$HALLMARK_P53_PATHWAY,
            index2 = idx$HALLMARK_E2F_TARGETS)

Visualize as a barplot

geom_GeneSet_Barchart(cam.100uM)

10uM Dose

ten_uM <- topTable(efit, coef=2, n=length(efit$genes$ID), sort.by = "logFC")
results <- as_tibble(ten_uM)
results$ID <- rownames(ten_uM)

# add a column of NAs
results$DElabel <- "n/s"
# if log2Foldchange > 1 or < -1 and pvalue < 0.05, set as "UP"
results$DElabel[results$logFC > 1 & results$adj.P.Val < 0.1] <- "TMZ"
results$DElabel[results$logFC < -1 & results$adj.P.Val < 0.1] <- "DMSO"
results_ten <- results

Add gene labels

results$genelabels <- ""
results$genelabels[results$Symbol == "GDF15"] <- "GDF15"
results$genelabels[results$Symbol == "FAS"] <- "FAS"
results$genelabels[results$Symbol == "CDKN1A"] <- "CDKN1A"
results$genelabels[results$Symbol == "PDGFRA"] <- "PDGFRA"
results$genelabels[results$Symbol == "FADS2"] <- "FADS2"
results$genelabels[results$Symbol == "MDM2"] <- "MDM2"
results$genelabels[results$Symbol == "AL365181.3"] <- "AL365181.3"

Volcano

results$DElabel <- factor(results$DElabel, levels = c("TMZ", "n/s", "DMSO"))  # Adjust as per your actual labels

plt2 <- ggplot(data = results, aes(x = logFC, y = -log10(adj.P.Val), colour = DElabel, label = genelabels)) + 
  geom_point(alpha = 0.33, size = 1.5) +
  geom_text_repel(size = 4, colour = "black") +
  guides(colour = guide_legend(override.aes = list(size = 3, alpha = 1))) +
  scale_color_manual(
    values = c("darkorange", "grey", "darkblue"),  # Colors in the desired order
    name = "Upregulated",
    labels = c("TMZ", "n/s", "DMSO")  # Optional: Add custom labels
  ) +
  geom_vline(xintercept = 1, linetype = "dotted") + 
  geom_vline(xintercept = -1, linetype = "dotted") +
  theme_Publication()

plt2

Generate heatmap

dge$logCPM <- edgeR::cpm(dge, log=TRUE, prior.count=1)
dge_subset <- dge[,dge$samples$Dose == "DMSO" | dge$samples$Dose == "10uM"]
create_gene_heatmap(de_results_df = results, dge_object = dge_subset, font_size = 12, num_genes=10)

Gene set testing with camera

Same pathways as 100uM.

cam.10uM<- camera(v,idx,design,contrast=contr.matrix[,2])
head(cam.10uM,10)

Generate gene set barchart

geom_GeneSet_Barchart(cam.10uM)

Compare 100uM Dose with 10uM dose

With the spline analysis of dose in notebook 3A_dose_spline there is a maximum gene expression effect at 10uM that declines at 100uM.
My interpretation is that cells are adapting to TMZ up to the 10uM dose then most are being killed with only the survivors being alive and available to have their transcriptome sequenced at 100uM dose.

So this comparison covers the small set of surviving cells at 100uM vs the majority of adapted cells at 10uM.

hundred_v_ten <- topTable(efit, coef=5, n=length(efit$genes$ID), sort.by = "logFC")
results <- as_tibble(hundred_v_ten)
results$ID <- rownames(hundred_v_ten)

# add a column of NAs
results$DElabel <- "n/s"
# if log2Foldchange > 1 or < -1 and pvalue < 0.05, set as "UP"
results$DElabel[results$logFC > 1 & results$adj.P.Val < 0.1] <- "100uM"
results$DElabel[results$logFC < -1 & results$adj.P.Val < 0.1] <- "10uM"

Add gene labels

results$genelabels <- ""
results$genelabels[results$Symbol == "MDM2"] <- "MDM2"
results$genelabels[results$Symbol == "E2F7"] <- "E2F7"
results$genelabels[results$Symbol == "DDIT3"] <- "DDIT3"
results$genelabels[results$Symbol == "PTP4A1"] <- "PTP4A1"

Volcano

Not many differentially expressed genes here. Likely the gene expression is pretty subtle.

# Update the plot
plt2 <- ggplot(data = results, aes(x = logFC, y = -log10(adj.P.Val), colour = DElabel, label = genelabels)) + 
  geom_point(alpha = 1, size = 1.5) +
  geom_text_repel(size = 4, colour = "black") +
  guides(colour = guide_legend(override.aes = list(size = 3, alpha = 1))) +
  scale_color_manual(
    values = c("darkblue", "grey"),
    name = "Upregulated"
  ) +
  geom_vline(xintercept = 1, linetype = "dotted") + 
  geom_vline(xintercept = -1, linetype = "dotted") +
  theme_Publication()

plt2

Gene set testing with camera

Clearly enriched p53 pathway in the 10uM dose (left). On the right 100uM, likely stemness phenotype which is consistent with the literature.

cam.100_vs_10uM <- camera(v,idx,design,contrast=contr.matrix[,5])
head(cam.100_vs_10uM,10)

Generate gene set barchart

geom_GeneSet_Barchart(cam.100_vs_10uM)

Venn digram 100uM vs 10uM dose

All the genes are in common.

hundred_v_dmso <- topTable(efit, coef=1, n=length(efit$genes$ID), sort.by = "logFC", p.value = 0.05, lfc=1)
ten_v_dmso <- topTable(efit, coef=2, n=length(efit$genes$ID), sort.by = "logFC", p.value = 0.05, lfc=1)


venn_list <- list(
  "100uM vs DMSO" = row.names(hundred_v_dmso), 
  "10uM vs DMSO" = row.names(ten_v_dmso)
  )

ggvenn(venn_list,
       show_elements = F, label_sep = "\n",
       text_size = 8,
       show_percentage = FALSE,
       fill_color = c("navy", "springgreen4")
       )

What are the 8 genes that are DE in 100uM vs DMSO?

# Extract the gene IDs from both data frames
genes_hundred <- hundred_v_dmso$ID
genes_ten <- ten_v_dmso$ID

# Find genes that are in hundred_v_dmso but not in ten_v_dmso
unique_genes <- setdiff(genes_hundred, genes_ten)

# To get the rows from hundred_v_dmso that are not in ten_v_dmso
hundred_v_dmso[hundred_v_dmso$ID %in% unique_genes, ]$Symbol
## [1] "RNF26"     "MELK"      "HIST1H1E"  "TMEM97"    "CDCA7"     "KIAA1549L"
## [7] "TUBA1B"    "SRSF1"

Session info

sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Red Hat Enterprise Linux 9.5 (Plow)
## 
## Matrix products: default
## BLAS:   /stornext/System/data/software/rhel/9/base/tools/R/4.4.1/lib64/R/lib/libRblas.so 
## LAPACK: /stornext/System/data/software/rhel/9/base/tools/R/4.4.1/lib64/R/lib/libRlapack.so;  LAPACK version 3.12.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] grid      stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] ggthemes_5.1.0              here_1.0.1                 
##  [3] patchwork_1.3.0             ggvenn_0.1.10              
##  [5] viridis_0.6.5               viridisLite_0.4.2          
##  [7] pheatmap_1.0.12             ggrepel_0.9.6              
##  [9] scater_1.32.1               org.Hs.eg.db_3.19.1        
## [11] AnnotationDbi_1.66.0        RColorBrewer_1.1-3         
## [13] edgeR_4.2.2                 limma_3.60.6               
## [15] biomaRt_2.60.1              scuttle_1.14.0             
## [17] lubridate_1.9.3             forcats_1.0.0              
## [19] stringr_1.5.1               dplyr_1.1.4                
## [21] purrr_1.0.2                 readr_2.1.5                
## [23] tidyr_1.3.1                 tibble_3.2.1               
## [25] ggplot2_3.5.1               tidyverse_2.0.0            
## [27] SingleCellExperiment_1.26.0 SummarizedExperiment_1.34.0
## [29] Biobase_2.64.0              GenomicRanges_1.56.2       
## [31] GenomeInfoDb_1.40.1         IRanges_2.38.1             
## [33] S4Vectors_0.42.1            BiocGenerics_0.50.0        
## [35] MatrixGenerics_1.16.0       matrixStats_1.4.1          
## 
## loaded via a namespace (and not attached):
##  [1] rstudioapi_0.17.0         jsonlite_1.8.9           
##  [3] magrittr_2.0.3            ggbeeswarm_0.7.2         
##  [5] farver_2.1.2              rmarkdown_2.28           
##  [7] zlibbioc_1.50.0           vctrs_0.6.5              
##  [9] memoise_2.0.1             DelayedMatrixStats_1.26.0
## [11] htmltools_0.5.8.1         S4Arrays_1.4.1           
## [13] progress_1.2.3            curl_5.2.3               
## [15] BiocNeighbors_1.22.0      SparseArray_1.4.8        
## [17] sass_0.4.9                bslib_0.8.0              
## [19] httr2_1.0.5               cachem_1.1.0             
## [21] igraph_2.0.3              lifecycle_1.0.4          
## [23] pkgconfig_2.0.3           rsvd_1.0.5               
## [25] Matrix_1.7-0              R6_2.5.1                 
## [27] fastmap_1.2.0             GenomeInfoDbData_1.2.12  
## [29] digest_0.6.37             colorspace_2.1-1         
## [31] rprojroot_2.0.4           dqrng_0.4.1              
## [33] irlba_2.3.5.1             RSQLite_2.3.7            
## [35] beachmat_2.20.0           labeling_0.4.3           
## [37] filelock_1.0.3            fansi_1.0.6              
## [39] timechange_0.3.0          httr_1.4.7               
## [41] abind_1.4-8               compiler_4.4.1           
## [43] bit64_4.5.2               withr_3.0.1              
## [45] BiocParallel_1.38.0       DBI_1.2.3                
## [47] highr_0.11                rappdirs_0.3.3           
## [49] DelayedArray_0.30.1       bluster_1.14.0           
## [51] tools_4.4.1               vipor_0.4.7              
## [53] beeswarm_0.4.0            glue_1.8.0               
## [55] cluster_2.1.6             generics_0.1.3           
## [57] gtable_0.3.5              tzdb_0.4.0               
## [59] hms_1.1.3                 metapod_1.12.0           
## [61] BiocSingular_1.20.0       ScaledMatrix_1.12.0      
## [63] xml2_1.3.6                utf8_1.2.4               
## [65] XVector_0.44.0            pillar_1.9.0             
## [67] splines_4.4.1             BiocFileCache_2.12.0     
## [69] lattice_0.22-6            bit_4.5.0                
## [71] tidyselect_1.2.1          locfit_1.5-9.10          
## [73] Biostrings_2.72.1         knitr_1.48               
## [75] gridExtra_2.3             xfun_0.48                
## [77] statmod_1.5.0             stringi_1.8.4            
## [79] UCSC.utils_1.0.0          yaml_2.3.10              
## [81] evaluate_1.0.1            codetools_0.2-20         
## [83] cli_3.6.3                 munsell_0.5.1            
## [85] jquerylib_0.1.4           Rcpp_1.0.13              
## [87] dbplyr_2.5.0              png_0.1-8                
## [89] parallel_4.4.1            blob_1.2.4               
## [91] prettyunits_1.2.0         scran_1.32.0             
## [93] sparseMatrixStats_1.16.0  scales_1.3.0             
## [95] crayon_1.5.3              rlang_1.1.4              
## [97] KEGGREST_1.44.1